################################################################################
## Group Identities and Parliamentary Debates: Replication package
## Fiva, Nedregård and Øien (2025)

# Description:

## Code to make Tables A3 and A5: "Summary statistics by speaker-session"
## and "Summary statistics by speaker characteristic"

################################################################################

# Packages

library(data.table)
library(lubridate)
library(xtable)
library(dplyr)

# Directories, relative to the working directory that master.R sets
dir     <- "../data/2_processed_data"
tab.dir <- "../results/tables"
in_text <- "../results/in_text"
raw.dir <- "../data/1_raw_data"

# Data

## Data to summarise, read from the processed-data directory
d <- fread(file.path(dir, "speeches_session_lemma.csv"))

## Derived variables, added to d by reference:
##   n.words          - copy of n_words_only
##   words_per_minute - sum.min.words / sum.min; the numerator uses only the
##                      number of words in speeches where we have minutes
##   w_per_s          - n_words_only / n.speeches (words per speech)
d[, `:=`(
  n.words          = n_words_only,
  words_per_minute = sum.min.words / sum.min,
  w_per_s          = n_words_only / n.speeches
)]



####################################################################

###### DESCRIPTIVE STATISTICS WHOLE SAMPLE

###################################################################


# Summarise selected columns of a data set, one output row per column.
#
# Arguments:
#   df          - a data.frame, or anything as.data.frame() can convert
#                 (the script passes a data.table).
#   summary.var - character vector with the names of the columns to summarise.
#   labels      - character vector of row labels, parallel to summary.var.
#
# Returns a character matrix with one row per element of summary.var and the
# columns "variable", "median", "mean", "sd", "min", "max" and "N". All
# statistics ignore missing values and are rounded to 2 decimals; N is the
# number of non-missing values. Returns NULL when summary.var is empty.
# Stops with an error if a requested column is not in df.
summary.function <- function(df, summary.var, labels){
  df <- as.data.frame(df)

  # Fail early and clearly on unknown column names.
  missing.var <- setdiff(summary.var, names(df))
  if (length(missing.var) > 0){
    stop("undefined columns selected: ",
         paste(missing.var, collapse = ", "), call. = FALSE)
  }

  sum.df <- lapply(seq_along(summary.var), function(v){
    x <- df[[summary.var[v]]]

    # Count non-missing values directly on the column. Subsetting the rows of
    # the data frame and taking dim() loses the count when df has only one
    # column, because the subset then drops to a plain vector.
    sum.v <- round(c("median" = median(x, na.rm = TRUE),
                     "mean"   = mean(x, na.rm = TRUE),
                     "sd"     = sd(x, na.rm = TRUE),
                     "min"    = min(x, na.rm = TRUE),
                     "max"    = max(x, na.rm = TRUE),
                     "N"      = sum(!is.na(x))), 2)

    # Prepending the label turns the whole row into character.
    c("variable" = labels[v], sum.v)
  })

  do.call(rbind, sum.df)
}


## Columns to summarise and the row labels used for them in the tables
summary.var <- c(
  "n.words", "n.speeches", "w_per_s", "sum.min", "words_per_minute"
)
labels <- c(
  "Number of words", "Number of speeches", "Words per speech",
  "Minutes of speech", "Words per minute"
)

## In-text numbers: rows with a non-missing words_per_minute, and the number
## of distinct candidatename_ed values among those rows

writeLines(
  format(nrow(d[!is.na(words_per_minute)]), big.mark = ","),
  file.path(in_text, "n_speaker_session_minutes.txt")
)

writeLines(
  format(length(unique(d[!is.na(words_per_minute)]$candidatename_ed)),
         big.mark = ","),
  file.path(in_text, "n_unique_speaker_minutes.txt")
)

## Summary statistics for the whole sample (character matrix, one row per
## variable) and the LaTeX body of Table A3

tab.sum <- summary.function(d, summary.var = summary.var, labels = labels)

z <- xtable(tab.sum, method = c("compact"), booktabs = TRUE)

print(z,
      only.contents = TRUE, comment = FALSE, hline.after = NULL,
      include.colnames = FALSE, include.rownames = FALSE,
      file = file.path(tab.dir, "tabA3.tex"))

## Writing the median words per minute, to be used in magnitude plot

writeLines(
  format(
    as.numeric(tab.sum[tab.sum[, "variable"] == "Words per minute", "median"]),
    trim = TRUE
  ),
  file.path(in_text, "median_words_per_minute.txt")
)

## Writing the summary numbers used in text: mean number of speeches and
## mean number of words

writeLines(
  format(
    as.numeric(tab.sum[tab.sum[, "variable"] == "Number of speeches", "mean"]),
    trim = TRUE, digits = 1
  ),
  file.path(in_text, "on_average_a_legislator_speaks.txt")
)

writeLines(
  format(
    as.numeric(tab.sum[tab.sum[, "variable"] == "Number of words", "mean"]),
    trim = TRUE, digits = 1, big.mark = ","
  ),
  file.path(in_text, "on_average_a_legislator_utters_roughly.txt")
)



####################################################################

###### DESCRIPTIVE STATISTICS BY BACKGROUND CHARACTERISTIC

###################################################################


# Summary function by group

# Summary statistics of several variables, split by a two-group background
# characteristic and laid out side by side (one output row per variable).
#
# Arguments:
#   df             - a data.table (the function uses data.table's
#                    `[, j, by]` syntax and rbindlist()).
#   background.var - name (string) of the grouping column in df.
#   sum_var        - character vector with the names of the columns to
#                    summarise.
#   label_var      - character vector of row labels, parallel to sum_var.
#
# Returns a data.table with one row per element of sum_var. The first column,
# var.name, holds the label; it is followed by median, mean, sd, min, max and
# n for the first group (column names suffixed with "_<group 1>") and the same
# six statistics for the second group (unsuffixed names). All statistics
# ignore missing values and are rounded to 1 decimal.
#
# Rows where the grouping column is NA or "" are left out. For the group
# values listed in known.pairs the column order is fixed; otherwise the first
# two group values are used in the order in which they appear in the grouped
# result. Stops with an error if two groups cannot be found.
summary_by_c <- function(df, background.var, sum_var, label_var){

  # Group pairs with a fixed (first, second) order. A pair applies when its
  # first element is among the group values found in the data.
  known.pairs <- list(c("kvinne", "mann"),
                      c("H", "V"),
                      c("old", "young"),
                      c("white", "other"))

  var.to.round <- c("median", "mean", "sd", "min", "max", "n")

  sum_results <- vector("list", length = length(sum_var))

  # Iterate by position so that results and labels line up even if a
  # variable name occurs more than once in sum_var.
  for (k in seq_along(sum_var)){

    v <- sum_var[k]

    sum.dt <- df[, .(median = as.double(median(get(v), na.rm = TRUE)),
                     mean   = mean(get(v),   na.rm = TRUE),
                     sd     = sd(get(v),     na.rm = TRUE),
                     min    = min(get(v),    na.rm = TRUE),
                     max    = max(get(v),    na.rm = TRUE),
                     n      = sum(!is.na(get(v)))), by = eval(background.var)]

    sum.dt[, (var.to.round) := lapply(.SD, function(x) round(x, 1)),
           .SDcols = var.to.round]

    sum.dt <- as.data.frame(sum.dt)
    sum.dt <- sum.dt[!is.na(sum.dt[, background.var]), ]

    # Decide which two groups to report, and in which order.
    ch <- unique(sum.dt[, background.var])
    ch <- ch[ch != ""]

    for (pair in known.pairs){
      if (pair[1] %in% ch){
        ch <- pair
      }
    }

    # Both groups must exist; otherwise the side-by-side layout below would
    # either fail in cbind() or silently produce a row of NAs.
    if (length(ch) < 2 || !all(ch[1:2] %in% sum.dt[, background.var])){
      stop("summary_by_c: could not find two groups of '", background.var,
           "' when summarising '", v, "'", call. = FALSE)
    }

    a <- sum.dt[sum.dt[, background.var] == ch[1], ]
    b <- sum.dt[sum.dt[, background.var] == ch[2], ]

    # First group: the grouping column becomes var.name and the statistics
    # get the group value as suffix. Second group: statistics only.
    names(a) <- c("var.name", paste(names(a)[-1], ch[1], sep = "_"))
    b[, background.var] <- NULL

    wide <- cbind(a, b)
    wide$var.name <- label_var[k]

    sum_results[[k]] <- wide
  }

  rbindlist(sum_results)
}

#--------------------BLOC----------------------------------------------###

# Table A5a: summary statistics by bloc
H.sum <- summary_by_c(df = d, background.var = "bloc", sum_var = summary.var, label_var = labels)


## Setting decimal points
column_names <- colnames(H.sum)

# xtable expects one digits entry per column plus a leading one for the
# row names, hence ncol + 1 here and the + 1 offset below.
digits <- rep(1, ncol(H.sum) + 1)

# Set digits to 0 for the two count columns
columns_to_zero <- c("n_H", "n")
digits[match(columns_to_zero, column_names) + 1] <- 0


zH <- xtable(H.sum, method = c("compact"), booktabs = TRUE, digits = digits)

# file.path() avoids the doubled slash that pasting "/tabA5a.tex" onto
# tab.dir with sep = "/" produced.
print.xtable(zH, only.contents = TRUE, comment = FALSE, hline.after = NULL,
             include.colnames = FALSE, include.rownames = FALSE,
             file = file.path(tab.dir, "tabA5a.tex"))

#--------------------GENDER----------------------------------------------###

# Table A5b: summary statistics by gender.
# female2 is "kvinne" where female equals 1 and "mann" for every other row
# (including rows where female is missing).
d[, female2 := case_when(
  female == 1 ~ "kvinne",
  TRUE        ~ "mann"
)]

ton.sum.kvinner <- summary_by_c(
  df             = d,
  background.var = "female2",
  sum_var        = summary.var,
  label_var      = labels
)

## Setting decimal points: 0 for the two count columns, 1 everywhere else.
## The leading entry is the slot xtable reserves for the row names.
column_names    <- colnames(ton.sum.kvinner)
columns_to_zero <- c("n_kvinne", "n")
digits          <- c(1, ifelse(column_names %in% columns_to_zero, 0, 1))

z <- xtable(ton.sum.kvinner, method = c("compact"), booktabs = TRUE,
            digits = digits)

print(z,
      only.contents = TRUE, comment = FALSE, hline.after = NULL,
      include.colnames = FALSE, include.rownames = FALSE,
      file = file.path(tab.dir, "tabA5b.tex"))



#--------------------AGE----------------------------------------------###

# Table A5c: summary statistics by age category
age.sum <- summary_by_c(
  df             = d,
  background.var = "age_cat",
  sum_var        = summary.var,
  label_var      = labels
)

## Setting decimal points: start from 1 decimal for every column (plus the
## leading slot xtable keeps for row names), then switch the two count
## columns to 0 decimals.
column_names    <- colnames(age.sum)
columns_to_zero <- c("n_old", "n")
digits <- replace(
  rep(1, ncol(age.sum) + 1),
  match(columns_to_zero, column_names) + 1,
  0
)

z <- xtable(age.sum, method = c("compact"), booktabs = TRUE, digits = digits)

print(z,
      only.contents = TRUE, comment = FALSE, hline.after = NULL,
      include.colnames = FALSE, include.rownames = FALSE,
      file = file.path(tab.dir, "tabA5c.tex"))


#--------------------RURALITY----------------------------------------------###

# Table A5d: summary statistics by town (rurality)
town.sum <- summary_by_c(df = d, background.var = "town", sum_var = summary.var,
                         label_var = labels)


## Setting decimal points
column_names <- colnames(town.sum)

# xtable expects one digits entry per column plus a leading one for the
# row names, hence ncol + 1 here and the + 1 offset below.
digits <- rep(1, ncol(town.sum) + 1)

# Set digits to 0 for the count columns. summary_by_c has no fixed group
# order for town, so the suffix of the first count column depends on which
# group value comes first in the data. Matching on the name pattern ("n" or
# "n_<group>") works for either order, whereas a hard-coded "n_urban" would
# silently leave the column at one decimal if the other group came first.
columns_to_zero <- column_names[grepl("^n(_|$)", column_names)]
digits[match(columns_to_zero, column_names) + 1] <- 0

z <- xtable(town.sum, method = c("compact"), booktabs = TRUE, digits = digits)

print.xtable(z, only.contents = TRUE, comment = FALSE, hline.after = NULL,
             include.colnames = FALSE, include.rownames = FALSE,
             file = file.path(tab.dir, "tabA5d.tex"))


#--------------------OCCUPATION----------------------------------------------###

# Table A5e: summary statistics by occupation
occupation.sum <- summary_by_c(df = d, background.var = "occupation", sum_var = summary.var, label_var = labels)

## Setting decimal points
column_names <- colnames(occupation.sum)

# xtable expects one digits entry per column plus a leading one for the
# row names, hence ncol + 1 here and the + 1 offset below.
digits <- rep(1, ncol(occupation.sum) + 1)

# Set digits to 0 for the two count columns
columns_to_zero <- c("n_white", "n")
digits[match(columns_to_zero, column_names) + 1] <- 0


# Pass the digits vector built above. A scalar `digits = 1` ignores it and
# prints the count columns with one decimal, unlike the other A5 panels.
z <- xtable(occupation.sum, method = c("compact"), booktabs = TRUE, digits = digits)

print.xtable(z, only.contents = TRUE, comment = FALSE, hline.after = NULL,
             include.colnames = FALSE, include.rownames = FALSE,
             file = file.path(tab.dir, "tabA5e.tex"))







